{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Ridge Regression\n",
    "\n",
    "#### Author: Yiran Jing\n",
    "\n",
    "#### Date: Feb 2020\n",
    "\n",
    "\n",
    "## Contents:\n",
    "1. Model selection\n",
    "   - Linear regression\n",
    "   - Ridge regression\n",
    "   - Lasso regression\n",
    "   \n",
    "Lasso performs best using `Days`, `suspected`, `confirmed_lag1` as regressors\n",
    "\n",
    "2. Ridge Residual Dignostic\n",
    "\n",
    "3. Model prediction\n",
    "\n",
    "## Summary"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import operator\n",
    "import matplotlib.pyplot as plt\n",
    "import pandas as pd\n",
    "import pandas\n",
    "import datetime\n",
    "import seaborn as sns\n",
    "import matplotlib.dates as mdates\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "from sklearn.metrics import mean_absolute_error\n",
    "# Import models from scikit learn module\n",
    "from sklearn.linear_model import Ridge, RidgeCV, ElasticNet, LassoCV, LassoLarsCV, LinearRegression, ElasticNetCV\n",
    "from helper_fun_model import *\n",
    "import warnings\n",
    "warnings.filterwarnings('ignore')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2020-02-16 14:14:52Update records successfully to ../data/DXY_Chinese.csv\n",
      "Save area daily dataset (English) into ../data/DXYArea.csv\n",
      "CPU times: user 374 ms, sys: 109 ms, total: 483 ms\n",
      "Wall time: 28.2 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "## Update data from DXY\n",
    "! cd ../data_processing && python DXY_AreaData_query.py"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Load data \n",
    "df = pd.read_csv(\"../data/DXYArea.csv\")\n",
    "\"\"\"\n",
    "Data Cleaning \n",
    "\"\"\"\n",
    "df['date'] = pd.to_datetime(df['date'])\n",
    "df = df[df['date'] != df['date'].max()] # remove todays' records (since it can be incompleted)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = df[df['date']>'2020-01-14']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Case 1: Overall China"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "china_df = get_China_total(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "#features_to_engineer = ['confirmed', 'suspected']\n",
    "features_to_engineer = ['confirmed'] # only calculate lag 1 for predictor, not regressor."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "china_df = feature_engineering(china_df, features_to_engineer)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>date</th>\n",
       "      <th>country</th>\n",
       "      <th>countryCode</th>\n",
       "      <th>province</th>\n",
       "      <th>city</th>\n",
       "      <th>confirmed</th>\n",
       "      <th>suspected</th>\n",
       "      <th>cured</th>\n",
       "      <th>dead</th>\n",
       "      <th>Days</th>\n",
       "      <th>net_confirmed</th>\n",
       "      <th>confirmed_lag1</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>7126</th>\n",
       "      <td>2020-02-14</td>\n",
       "      <td>中国</td>\n",
       "      <td>CN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>66575</td>\n",
       "      <td>8969</td>\n",
       "      <td>8101</td>\n",
       "      <td>1524</td>\n",
       "      <td>68</td>\n",
       "      <td>56950</td>\n",
       "      <td>63932.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7471</th>\n",
       "      <td>2020-02-15</td>\n",
       "      <td>中国</td>\n",
       "      <td>CN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>68584</td>\n",
       "      <td>8229</td>\n",
       "      <td>9425</td>\n",
       "      <td>1666</td>\n",
       "      <td>69</td>\n",
       "      <td>57493</td>\n",
       "      <td>66575.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "           date country countryCode province city  confirmed  suspected  \\\n",
       "7126 2020-02-14      中国          CN      NaN  NaN      66575       8969   \n",
       "7471 2020-02-15      中国          CN      NaN  NaN      68584       8229   \n",
       "\n",
       "      cured  dead  Days  net_confirmed  confirmed_lag1  \n",
       "7126   8101  1524    68          56950         63932.0  \n",
       "7471   9425  1666    69          57493         66575.0  "
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "china_df.tail(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Train dataset: data before 2020-02-10 00:00:00 \n",
      "Test dataset: the last 5 days\n"
     ]
    }
   ],
   "source": [
    "Train, Test = split_train_test_by_date(china_df, 5) \n",
    "\n",
    "regressors = ['Days', 'suspected'] # other variables we want to apply into model\n",
    "#regressors = ['Days']\n",
    "\n",
    "\n",
    "X_train = Train.loc[:,regressors + [x+'_lag1' for x in features_to_engineer]]\n",
    "y_train = Train['confirmed']\n",
    "X_test =  Test.loc[:,regressors + [x+'_lag1' for x in features_to_engineer]]\n",
    "y_test = Test['confirmed']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['Days', 'suspected', 'confirmed_lag1'], dtype='object')"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X_train.columns # regressor within model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Use StandardScaler to transform numerical data \n",
    "# Note: truning sc based on X_train\n",
    "sc = StandardScaler().fit(X_train)\n",
    "X_train_sc = sc.transform(X_train)\n",
    "#X_validate_numerical_sc = sc.transform(X_validate_numerical)\n",
    "X_test_sc = sc.transform(X_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[-1.66666667, -1.0496374 , -0.8251839 ],\n",
       "       [-1.53333333, -1.0496374 , -0.82174408],\n",
       "       [-1.4       , -1.0496374 , -0.82140848]])"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X_train_sc[:3]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Linear regression"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Linear Regression MAPE on Validation set : 0.056702261703221964\n",
      "Linear Regression MAE on Validation set : 3457.6388138600546\n",
      "Linear Regression R2 on Validation set : 0.7091787851793272\n"
     ]
    }
   ],
   "source": [
    "# Fit the linear regression\n",
    "linear = LinearRegression()\n",
    "linear.fit(X_train_sc, y_train)\n",
    "\n",
    "# # Find the validation score\n",
    "get_validation_score(linear, X_test_sc, y_test)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## LASSO"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "basic alpha : 60.69496136740685\n"
     ]
    }
   ],
   "source": [
    "# Find from default hyperparameters \n",
    "lasso = LassoCV(cv=10)\n",
    "lasso.fit(X_train_sc, y_train)\n",
    "alpha = lasso.alpha_\n",
    "print(\"basic alpha :\", alpha)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Try again for more precision with alphas centered around 60.69496136740685\n",
      "Best alpha : 54.625465230666165\n"
     ]
    }
   ],
   "source": [
    "# Find from more refined list\n",
    "print(\"Try again for more precision with alphas centered around \" + str(alpha))\n",
    "lasso = LassoCV(alphas = [alpha * .6, alpha * .65, alpha * .7, alpha * .75, alpha * .8, \n",
    "                          alpha * .85, alpha * .9, alpha * .95, alpha, alpha * 1.05, \n",
    "                          alpha * 1.1, alpha * 1.15, alpha * 1.25, alpha * 1.3, alpha * 1.35, \n",
    "                          alpha * 1.4], \n",
    "                max_iter = 50000, cv = 5)\n",
    "lasso.fit(X_train_sc, y_train)\n",
    "alpha = lasso.alpha_\n",
    "print(\"Best alpha :\", alpha)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Linear Regression MAPE on Validation set : 0.058677786581701134\n",
      "Linear Regression MAE on Validation set : 3579.3900118362144\n",
      "Linear Regression R2 on Validation set : 0.6995322154670371\n"
     ]
    }
   ],
   "source": [
    "# # Find the validation score\n",
    "get_validation_score(lasso, X_test_sc, y_test)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Ridge"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "basic alpha : 0.1\n"
     ]
    }
   ],
   "source": [
    "# Find from default hyperparameters \n",
    "ridge = RidgeCV(cv=5)\n",
    "ridge.fit(X_train_sc, y_train)\n",
    "alpha = ridge.alpha_\n",
    "print(\"basic alpha :\", alpha)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Try again for more precision with alphas centered around 0.1\n",
      "Best alpha : 0.06\n"
     ]
    }
   ],
   "source": [
    "# Find from more refined list\n",
    "print(\"Try again for more precision with alphas centered around \" + str(alpha))\n",
    "ridge = RidgeCV(alphas = [alpha * .6, alpha * .65, alpha * .7, alpha * .75, alpha * .8, \n",
    "                          alpha * .85, alpha * .9, alpha * .95, alpha, alpha * 1.05, \n",
    "                          alpha * 1.1, alpha * 1.15, alpha * 1.25, alpha * 1.3, alpha * 1.35, \n",
    "                          alpha * 1.4], cv = 5)\n",
    "ridge.fit(X_train_sc, y_train)\n",
    "alpha = ridge.alpha_\n",
    "print(\"Best alpha :\", alpha)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Linear Regression MAPE on Validation set : 0.06345581028984114\n",
      "Linear Regression MAE on Validation set : 3870.098410464074\n",
      "Linear Regression R2 on Validation set : 0.67935080063222\n"
     ]
    }
   ],
   "source": [
    "# # Find the validation score\n",
    "get_validation_score(ridge, X_test_sc, y_test)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Save prediction result to CSV"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_index = X_train.index\n",
    "test_index = X_test.index\n",
    "\n",
    "# Create dataframe\n",
    "X_train = pd.DataFrame(X_train_sc, columns=X_train.columns, index=train_index)\n",
    "#X_validate_numerical_sc = pd.DataFrame(X_validate_numerical_sc, columns=numerical, index=validate_index)\n",
    "X_test = pd.DataFrame(X_test_sc, columns=X_train.columns, index=test_index)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "ename": "FileNotFoundError",
     "evalue": "[Errno 2] No such file or directory: 'ridge_prediction/residuals_ridge_test_0211.csv'",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mFileNotFoundError\u001b[0m                         Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-21-f9a1f09c9aca>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m     15\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     16\u001b[0m \u001b[0;31m# Write to a csv file\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 17\u001b[0;31m \u001b[0mdf_residuals_ridge\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mto_csv\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'ridge_prediction/residuals_ridge_test_0211.csv'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mindex_label\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'test_index'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[0;32m~/miniconda3/lib/python3.7/site-packages/pandas/core/generic.py\u001b[0m in \u001b[0;36mto_csv\u001b[0;34m(self, path_or_buf, sep, na_rep, float_format, columns, header, index, index_label, mode, encoding, compression, quoting, quotechar, line_terminator, chunksize, date_format, doublequote, escapechar, decimal)\u001b[0m\n\u001b[1;32m   3227\u001b[0m             \u001b[0mdecimal\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdecimal\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   3228\u001b[0m         )\n\u001b[0;32m-> 3229\u001b[0;31m         \u001b[0mformatter\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msave\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   3230\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   3231\u001b[0m         \u001b[0;32mif\u001b[0m \u001b[0mpath_or_buf\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/miniconda3/lib/python3.7/site-packages/pandas/io/formats/csvs.py\u001b[0m in \u001b[0;36msave\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m    181\u001b[0m                 \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmode\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    182\u001b[0m                 \u001b[0mencoding\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mencoding\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 183\u001b[0;31m                 \u001b[0mcompression\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcompression\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    184\u001b[0m             )\n\u001b[1;32m    185\u001b[0m             \u001b[0mclose\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/miniconda3/lib/python3.7/site-packages/pandas/io/common.py\u001b[0m in \u001b[0;36m_get_handle\u001b[0;34m(path_or_buf, mode, encoding, compression, memory_map, is_text)\u001b[0m\n\u001b[1;32m    395\u001b[0m         \u001b[0;32mif\u001b[0m \u001b[0mencoding\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    396\u001b[0m             \u001b[0;31m# Encoding\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 397\u001b[0;31m             \u001b[0mf\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpath_or_buf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmode\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mencoding\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mencoding\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnewline\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    398\u001b[0m         \u001b[0;32melif\u001b[0m \u001b[0mis_text\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    399\u001b[0m             \u001b[0;31m# No explicit encoding\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: 'ridge_prediction/residuals_ridge_test_0211.csv'"
     ]
    }
   ],
   "source": [
    "# Calculate the predictions \n",
    "predictions_ridge = np.squeeze(ridge.predict(X_test))\n",
    "\n",
    "# Calculate the residuals\n",
    "residuals_ridge = np.squeeze(y_test) - predictions_ridge\n",
    "\n",
    "# Create dataframe\n",
    "d = {'y_test': np.squeeze(y_test), 'predictions': predictions_ridge, 'residuals_ridge': residuals_ridge}\n",
    "df_residuals_ridge = pd.DataFrame(data=d, index=test_index)\n",
    "\n",
    "# Left join the period\n",
    "df_residuals_ridge['date'] = Test['date']\n",
    "\n",
    "df_residuals_ridge.shape\n",
    "\n",
    "# Write to a csv file\n",
    "df_residuals_ridge.to_csv('ridge_prediction/residuals_ridge_test_0211.csv', index_label='test_index')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Calculate the predictions \n",
    "predictions_ridge = np.squeeze(ridge.predict(X_train))\n",
    "\n",
    "# Calculate the residuals\n",
    "residuals_ridge = np.squeeze(y_train) - predictions_ridge\n",
    "\n",
    "# Create dataframe\n",
    "d = {'y_train': np.squeeze(y_train), 'predictions': predictions_ridge, 'residuals_ridge': residuals_ridge}\n",
    "df_residuals_ridge = pd.DataFrame(data=d, index=train_index)\n",
    "\n",
    "# Left join the period\n",
    "df_residuals_ridge['date'] = Train['date']\n",
    "\n",
    "# Write to a csv file\n",
    "df_residuals_ridge.to_csv('ridge_prediction/residuals_ridge_train_0211.csv', index_label='train_index')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_residuals_ridge.head(2)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Ridge Residual Dignostic\n",
    "\n",
    "Since ridge is the optimal model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sns.jointplot(\"y_train\", \"residuals_ridge\", data=df_residuals_ridge, kind=\"reg\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Ridge model prediction"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "ridge_res_train = pd.read_csv('ridge_prediction/residuals_ridge_train_0211.csv')\n",
    "del ridge_res_train['train_index']\n",
    "ridge_res_test = pd.read_csv('ridge_prediction/residuals_ridge_test_0211.csv')\n",
    "del ridge_res_test['test_index']\n",
    "\n",
    "ridge_res_test.Timestamp = pd.to_datetime(ridge_res_test.date,format='%Y-%m-%d %H:%M') \n",
    "ridge_res_test.index = ridge_res_test.Timestamp\n",
    "ridge_res_test = ridge_res_test.sort_index()\n",
    "\n",
    "ridge_res_train.Timestamp = pd.to_datetime(ridge_res_train.date,format='%Y-%m-%d %H:%M') \n",
    "ridge_res_train.index = ridge_res_train.Timestamp\n",
    "ridge_res_train = ridge_res_train.sort_index()\n",
    "\n",
    "ridge_all = pd.concat([ridge_res_train,ridge_res_test])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "#Plotting data\n",
    "ridge_res_train['predictions'].plot(figsize=(17,6), title= 'Confirmed Cases Forecast (China total)', fontsize=10,\n",
    "                                        label= 'Fitted value',linewidth=1, color = 'grey')\n",
    "ridge_res_train['y_train'].plot(figsize=(17,6),label= 'Observation',linewidth=1, color = 'steelblue')\n",
    "ridge_res_test['y_test'].plot(figsize=(17,6), label= 'Observation', linewidth=1, color = 'steelblue')\n",
    "\n",
    "ridge_res_test['predictions'].plot(figsize=(17,6), label= 'Ridge forecasting', color = 'tomato',linewidth=1)\n",
    "plt.ylabel('Confirmed cases')\n",
    "plt.xlabel('')\n",
    "plt.legend()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
